import pandas as pd
import plotly as plt
import numpy as np
pd.options.display.max_rows = 4000
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import plotly.express as px
import plotly.subplots as tls
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV, StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
SEED = 42
# Load the Kickstarter export and discard the raw amount columns, keeping
# `real_usd_pledged` and `usd_goal`.
df_final = pd.read_csv('out2.zip')
df_final = df_final.drop(columns=['goal', 'pledged', 'usd pledged'])
# `datetime_is_numeric` was removed from `describe` in pandas 2.0 and raises a
# TypeError there. The date columns are still plain strings at this point
# (they are parsed with `pd.to_datetime` further down), so the flag had no
# effect on this summary anyway.
df_final.describe(include='all')
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.818530e+05 | 281853 | 281853 | 281853 | 281853 | 281853 | 281853 | 281853 | 281853.000000 | 281643 | 2.818530e+05 | 2.818530e+05 |
| unique | NaN | 280147 | 158 | 15 | 13 | 257784 | 281499 | 2 | NaN | 21 | NaN | NaN |
| top | NaN | New EP/Music Development | Product Design | Film & Video | USD | 2012-01-01 05:59:00 | 2015-05-26 18:00:38 | failed | NaN | US | NaN | NaN |
| freq | NaN | 15 | 14540 | 51130 | 229975 | 46 | 2 | 168605 | NaN | 229824 | NaN | NaN |
| mean | 1.074956e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 111.852530 | NaN | 9.293935e+03 | 4.115817e+04 |
| std | 6.194665e+08 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 996.270201 | NaN | 9.314645e+04 | 1.082761e+06 |
| min | 5.971000e+03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 | NaN | 0.000000e+00 | 1.000000e-02 |
| 25% | 5.373183e+08 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.000000 | NaN | 5.000000e+01 | 2.000000e+03 |
| 50% | 1.075897e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 15.000000 | NaN | 7.914802e+02 | 5.000000e+03 |
| 75% | 1.611414e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 62.000000 | NaN | 4.500000e+03 | 1.500000e+04 |
| max | 2.147476e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 219382.000000 | NaN | 2.033899e+07 | 1.663627e+08 |
# Cast the date columns to datetimes and the id/amount columns to numbers.
for date_col in ("deadline", "launched"):
    df_final[date_col] = pd.to_datetime(df_final[date_col])
for number_col in ("ID", "backers", "real_usd_pledged", "usd_goal"):
    df_final[number_col] = pd.to_numeric(df_final[number_col])
df_final.dtypes
ID int64 name object category object main_category object currency object deadline datetime64[ns] launched datetime64[ns] state object backers int64 country object real_usd_pledged float64 usd_goal float64 dtype: object
df_final.isnull().any()
ID False name False category False main_category False currency False deadline False launched False state False backers False country True real_usd_pledged False usd_goal False dtype: bool
df_final[df_final['country'].isnull()].head()
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1833 | 1012744036 | An Oratorio for our Time - Last Stop Cafe | Music | Music | USD | 2016-05-26 19:46:26 | 2016-04-26 19:46:26 | successful | 0 | NaN | 5170.00 | 5000.0 |
| 3670 | 1024989802 | Jackson Ruby: The Cassette Album | Music | Music | USD | 2016-05-11 19:29:17 | 2016-04-11 19:29:17 | successful | 0 | NaN | 5296.00 | 5000.0 |
| 4004 | 1027275369 | Help Parker Brown make his first solo album | Music | Music | USD | 2016-05-21 16:14:28 | 2016-04-21 16:14:28 | successful | 0 | NaN | 5077.00 | 3800.0 |
| 4180 | 1028691308 | Help BETHANY record a NEW single in Nash! | Music | Music | USD | 2016-05-08 22:37:00 | 2016-04-04 23:32:00 | successful | 0 | NaN | 3502.13 | 3500.0 |
| 6157 | 1041708793 | Serena Gabriel's first CD!!!!!!: Diving Deep | Music | Music | USD | 2016-05-01 04:22:00 | 2016-03-25 17:20:21 | successful | 0 | NaN | 3787.00 | 3500.0 |
df_final[df_final['country'].isnull()].shape
(210, 12)
Let's drop these rows: they have 0 backers, no country and, previously, no usd pledged value, which looks like a mistake made when the data was collected.
# Keep only the rows that have a country.
df_final = df_final[df_final['country'].notnull()]
# Discard contradictory rows: flagged as failed although the pledged amount
# reached the goal.
goal_reached = df_final['real_usd_pledged'] >= df_final['usd_goal']
marked_failed = df_final['state'] == 'failed'
df_final = df_final.loc[~(goal_reached & marked_failed)].reset_index(drop=True)
df_final.isnull().any()
ID False name False category False main_category False currency False deadline False launched False state False backers False country False real_usd_pledged False usd_goal False dtype: bool
df_final.shape
(281637, 12)
df_final.duplicated().sum()
0
# Tally how often each project name occurs, then keep every row whose name
# occurs more than once.
counts = (
    df_final['name']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='counts')
)
repeated_names = counts.loc[counts['counts'] > 1, 'name'].tolist()
duplicate_names = df_final[df_final['name'].isin(repeated_names)]
duplicate_names.shape
(3022, 12)
duplicate_names.sort_values(by=['name']).head()
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1529 | 1010584633 | "A Fresh Start" | Shorts | Film & Video | USD | 2011-09-25 06:00:00 | 2011-08-28 21:27:52 | successful | 24 | US | 3000.0 | 3000.0 |
| 240126 | 713417995 | "A Fresh Start" | Documentary | Film & Video | USD | 2013-03-27 02:51:47 | 2013-01-26 03:51:47 | failed | 26 | US | 1417.0 | 5000.0 |
| 96268 | 1656736114 | "ONE" | Apps | Technology | USD | 2016-02-25 17:37:04 | 2016-01-26 17:37:04 | failed | 0 | US | 0.0 | 10000.0 |
| 86497 | 159049492 | "ONE" | Classical Music | Music | USD | 2016-09-26 03:19:12 | 2016-08-12 03:19:12 | successful | 113 | US | 10261.0 | 10000.0 |
| 281045 | 996180421 | "On The Road" | Webseries | Film & Video | USD | 2015-01-22 17:31:06 | 2014-12-23 17:31:06 | failed | 0 | US | 0.0 | 80000.0 |
I'll leave these as they are, but it's interesting to see that some duplicates seem genuine, others seem to be the same project revamped or relaunched, and others are another rendition of the same project (for instance, a theater play and a video).
It would be interesting to know more about the motives and mindset of the people creating these projects 'again' (a renewed need for funds?), and whether some cases are reboots of past successful projects (hoaxes?).
Overall, these rows can still be included in our model, since we want to predict the success or failure of a campaign regardless.
def plot_continuous_vars(data, column_name):
    """Plot a column's distribution on its raw scale and on a log1p scale.

    Draws two side-by-side axes on a new figure: the left one shows
    ``data[column_name]`` as-is, the right one shows ``np.log1p`` of it.

    Args:
        data: DataFrame containing the column to plot.
        column_name: Name of a numeric column of ``data``.
    """
    plot_dims = (14, 8)
    _, (raw_ax, log_ax) = plt.subplots(ncols=2, sharey=False, figsize=plot_dims)
    # `sns.distplot` is deprecated and slated for removal; `histplot` is the
    # axes-level replacement. `kde=True` with `stat="density"` keeps the
    # histogram-plus-density-curve look, and a fixed bin count keeps the
    # long-tailed raw scale cheap to draw.
    sns.histplot(data[column_name], bins=50, kde=True, stat="density", ax=raw_ax)
    sns.histplot(np.log1p(data[column_name]), bins=50, kde=True, stat="density", ax=log_ax)
plot_continuous_vars(df_final, 'usd_goal')
C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
plot_continuous_vars(df_final, 'real_usd_pledged')
C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
We take the log to better see the distributions as we have outliers in both cases.
# Overlay the goal distributions of failed and successful campaigns.
df_failed = df_final[df_final["state"] == "failed"]
df_sucess = df_final[df_final["state"] == "successful"]

# Histogram data on a log scale. `np.log1p` matches the transform used for
# the other goal plots and for the model feature, and is the numerically
# safer spelling of log(x + 1) for very small goals.
failed = np.log1p(df_failed['usd_goal'])
success = np.log1p(df_sucess['usd_goal'])

# One semi-transparent trace per outcome, each normalised to probabilities so
# the two groups are comparable despite their different sizes.
trace1 = go.Histogram(
    x=failed,
    opacity=0.60, nbinsx=30, name='Goals Failed', histnorm='probability'
)
trace2 = go.Histogram(
    x=success,
    opacity=0.60, nbinsx=30, name='Goals Sucessful', histnorm='probability'
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay', title=go.layout.Title(text="Distributions of usd_goal"))
fig = go.Figure(
    data=data,
    layout=layout
)
iplot(fig)
Based on the above histogram, it seems that failed projects tend to have higher values (i.e. higher goals).
import plotly.express as px
# Spread of the funding goal within each main category.
fig = px.box(data_frame=df_final, x="main_category", y="usd_goal")
fig.show()

# Backer-count distributions, one figure per campaign outcome.
df_failed = df_final.loc[df_final["state"].eq("failed")]
df_success = df_final.loc[df_final["state"].eq("successful")]
for outcome_frame in (df_failed, df_success):
    plot_continuous_vars(outcome_frame, 'backers')
C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
Variables for the logistic regression:
Others
to predict target variable state
def getDelta(a, b):
    """Return the ``days`` component of the difference ``a - b``."""
    elapsed = a - b
    return elapsed.days
# Duration of the project in days. Subtracting the two datetime columns
# directly yields the same `.days` value as calling getDelta on every row,
# without a Python-level pass over each row of the frame.
df_final['duration'] = (df_final['deadline'] - df_final['launched']).dt.days
# Calendar month of the launch date.
df_final['month'] = df_final['launched'].dt.month
# Launch period as "<year>-<month>"; the month is not zero-padded ("2015-8").
df_final['year_month'] = (
    df_final['launched'].dt.year.astype(str)
    + "-"
    + df_final['launched'].dt.month.astype(str)
)
import re
def has_non_chars(name):
    """Return 1 if ``name`` has a character that is neither a letter nor '?'/'!', else 0."""
    found = any(not ch.isalpha() and ch not in ('?', '!') for ch in name)
    return 1 if found else 0
def has_exclamation_interrogation(name):
    """Return 1 if ``name`` contains '!' or '?', else 0."""
    return 1 if any(mark in name for mark in ("!", "?")) else 0
def has_upper(name):
    """Return 1 if any space-separated word of ``name`` is upper-case, else 0.

    A word only counts when more than one character is left after removing
    its non-word characters, so a lone capital such as "A" or "A!" is ignored.
    """
    def _is_shouted(token):
        stripped = re.sub(r'\W+', '', token)
        return token.isupper() and len(stripped) > 1

    return int(any(_is_shouted(token) for token in name.split(' ')))
# Features derived from the project name.
project_names = df_final['name']
df_final['len_name'] = project_names.str.len()
df_final['name_nb_words'] = project_names.map(lambda title: len(str(title).split(' ')))
df_final['name_non_chars'] = project_names.map(has_non_chars)
df_final['name_has_symbol'] = project_names.map(has_exclamation_interrogation)
df_final['name_upper'] = project_names.map(has_upper)

# Combined "<main_category>-<category>" label.
category_pair = df_final[["main_category", "category"]]
df_final['cat_full'] = category_pair.agg('-'.join, axis=1)
df_final.head()
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | ... | usd_goal | duration | month | year_month | len_name | name_nb_words | name_non_chars | name_has_symbol | name_upper | cat_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000002330 | The Songs of Adelaide & Abullah | Poetry | Publishing | GBP | 2015-10-09 11:36:00 | 2015-08-11 12:12:28 | failed | 0 | GB | ... | 1533.954368 | 58 | 8 | 2015-8 | 31 | 6 | 1 | 0 | 0 | Publishing-Poetry |
| 1 | 1000004038 | Where is Hank? | Narrative Film | Film & Video | USD | 2013-02-26 00:20:50 | 2013-01-12 00:20:50 | failed | 3 | US | ... | 45000.000000 | 45 | 1 | 2013-1 | 14 | 3 | 1 | 1 | 0 | Film & Video-Narrative Film |
| 2 | 1000007540 | ToshiCapital Rekordz Needs Help to Complete Album | Music | Music | USD | 2012-04-16 04:24:11 | 2012-03-17 03:24:11 | failed | 1 | US | ... | 5000.000000 | 30 | 3 | 2012-3 | 49 | 7 | 1 | 0 | 0 | Music-Music |
| 3 | 1000014025 | Monarch Espresso Bar | Restaurants | Food | USD | 2016-04-01 13:38:27 | 2016-02-26 13:38:27 | successful | 224 | US | ... | 50000.000000 | 35 | 2 | 2016-2 | 20 | 3 | 1 | 0 | 0 | Food-Restaurants |
| 4 | 1000023410 | Support Solar Roasted Coffee & Green Energy! ... | Food | Food | USD | 2014-12-21 18:30:44 | 2014-12-01 18:30:44 | successful | 16 | US | ... | 1000.000000 | 20 | 12 | 2014-12 | 60 | 9 | 1 | 1 | 0 | Food-Food |
5 rows × 21 columns
df_final.columns
Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
'launched', 'state', 'backers', 'country', 'real_usd_pledged',
'usd_goal', 'duration', 'month', 'year_month', 'len_name',
'name_nb_words', 'name_non_chars', 'name_has_symbol', 'name_upper',
'cat_full'],
dtype='object')
# Modelling frame: an independent copy of df_final without the id, name,
# raw dates, year_month, backers and pledged-amount columns.
non_feature_cols = [
    'ID',
    'name',
    'deadline',
    'launched',
    'year_month',
    'backers',
    'real_usd_pledged',
]
ks = df_final.drop(columns=non_feature_cols).copy()
ks.columns
Index(['category', 'main_category', 'currency', 'state', 'country', 'usd_goal',
'duration', 'month', 'len_name', 'name_nb_words', 'name_non_chars',
'name_has_symbol', 'name_upper', 'cat_full'],
dtype='object')
`usd_goal` is skewed (see its distribution above), so let's replace it with a log-transformed version.
# Log-transformed goal, stored next to the raw column.
ks['usd_goal_corrected'] = np.log1p(ks['usd_goal'])
# Encode the target: successful -> 1, failed -> 0.
ks['state'] = ks['state'].map({'successful': 1, 'failed': 0})
# Write the profiling report of the modelling frame to an HTML file.
profile = ProfileReport(ks, title="Pandas Profiling Report Kickstarter")
profile.to_file('kickstarterds.html')
# ## This heatmap is also available from pandas-profiling html file.
# corr = ks.corr()
# dims = (16, 10)
# fig, ax = plt.subplots(figsize = dims)
# sns.heatmap(corr,
# xticklabels=corr.columns.values,
# yticklabels=corr.columns.values,ax = ax, cmap="Blues")
# Remove the columns made redundant by other columns.
ks = ks.drop(columns=[
    'name_nb_words',   # highly correlated with len_name
    'currency',        # explained by the country
    'category',        # encoded in cat_full
    'main_category',   # encoded in cat_full
])
ks.columns
Index(['state', 'country', 'usd_goal', 'duration', 'month', 'len_name',
'name_non_chars', 'name_has_symbol', 'name_upper', 'cat_full',
'usd_goal_corrected'],
dtype='object')
ks.state.value_counts(normalize=True)
0 0.598267 1 0.401733 Name: state, dtype: float64
We can consider the dataset reasonably balanced, given the roughly 60/40 % split between failed and successful campaigns.
ks.dtypes
state int64 country object usd_goal float64 duration int64 month int64 len_name int64 name_non_chars int64 name_has_symbol int64 name_upper int64 cat_full object usd_goal_corrected float64 dtype: object
ks.describe(include='all')
| state | country | usd_goal | duration | month | len_name | name_non_chars | name_has_symbol | name_upper | cat_full | usd_goal_corrected | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 281637.000000 | 281637 | 2.816370e+05 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637 | 281637.000000 |
| unique | NaN | 21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 165 | NaN |
| top | NaN | US | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Design-Product Design | NaN |
| freq | NaN | 229818 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14538 | NaN |
| mean | 0.401733 | NaN | 4.117401e+04 | 33.976253 | 6.372977 | 34.023541 | 0.963318 | 0.105174 | 0.207533 | NaN | 8.587772 |
| std | 0.490249 | NaN | 1.083168e+06 | 12.893991 | 3.274156 | 15.917254 | 0.187980 | 0.306779 | 0.405541 | NaN | 1.659265 |
| min | 0.000000 | NaN | 1.000000e-02 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.009950 |
| 25% | 0.000000 | NaN | 2.000000e+03 | 30.000000 | 4.000000 | 20.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 7.601402 |
| 50% | 0.000000 | NaN | 5.000000e+03 | 30.000000 | 6.000000 | 33.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 8.517393 |
| 75% | 1.000000 | NaN | 1.500000e+04 | 36.000000 | 9.000000 | 48.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 9.615872 |
| max | 1.000000 | NaN | 1.663627e+08 | 92.000000 | 12.000000 | 85.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 18.929681 |
from sklearn.model_selection import train_test_split

# Target is `state`; the raw goal is left out of the features (its log1p
# version, usd_goal_corrected, stays in).
y = ks['state']
x = ks.drop(columns=['state', 'usd_goal'])

# 80/20 split, seeded with SEED.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SEED
)
for label, part in (
    ('x_train.shape:', x_train),
    ('y_train.shape:', y_train),
    ('x_test.shape :', x_test),
    ('y_test.shape :', y_test),
):
    print(label, part.shape)
x_train.shape: (225309, 9) y_train.shape: (225309,) x_test.shape : (56328, 9) y_test.shape : (56328,)
x_train.columns
Index(['country', 'duration', 'month', 'len_name', 'name_non_chars',
'name_has_symbol', 'name_upper', 'cat_full', 'usd_goal_corrected'],
dtype='object')
from pprint import pprint
import mlflow
def fetch_logged_data(run_id):
    """Collect what MLflow recorded for the run ``run_id``.

    Returns:
        A ``(params, metrics, tags, artifacts)`` tuple. ``tags`` leaves out
        every key starting with ``"mlflow."``; ``artifacts`` holds the paths
        listed under the run's ``"model"`` artifact directory.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    run_data = tracking_client.get_run(run_id).data

    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    artifact_paths = [
        entry.path for entry in tracking_client.list_artifacts(run_id, "model")
    ]
    return run_data.params, run_data.metrics, user_tags, artifact_paths
# Enable autologging so that every sklearn fit below is recorded by MLflow.
mlflow.sklearn.autolog()

from sklearn.dummy import DummyClassifier

# Chance-level baseline: predicts classes uniformly at random. It is seeded
# with the notebook-wide SEED constant (instead of a second hard-coded 42)
# so all random components share one source of truth.
model = DummyClassifier(strategy='uniform', random_state=SEED)
with mlflow.start_run() as run:
    model.fit(x_train, y_train)

# Read back and display what autologging stored for this run.
params, metrics, tags, artifacts = fetch_logged_data(run.info.run_id)
pprint(params)
pprint(metrics)
pprint(tags)
pprint(artifacts)
2021/06/17 06:44:25 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "C:\Users\ymaricar\anaconda3\lib\site-packages\mlflow\models\signature.py:127: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
{'constant': 'None', 'random_state': '42', 'strategy': 'uniform'}
{'training_accuracy_score': 0.500264081772144,
'training_f1_score': 0.5051236324432034,
'training_log_loss': 0.6931471805599451,
'training_precision_score': 0.5195800439022963,
'training_recall_score': 0.500264081772144,
'training_roc_auc_score': 0.5,
'training_score': 0.500264081772144}
{'estimator_class': 'sklearn.dummy.DummyClassifier',
'estimator_name': 'DummyClassifier'}
['model/MLmodel', 'model/conda.yaml', 'model/model.pkl']
x_train.columns
Index(['country', 'duration', 'month', 'len_name', 'name_non_chars',
'name_has_symbol', 'name_upper', 'cat_full', 'usd_goal_corrected'],
dtype='object')
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
# Continuous inputs: mean-impute missing values, then scale with RobustScaler.
numeric_features = ['usd_goal_corrected', 'duration', 'len_name']
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", RobustScaler()),
])

# Discrete inputs: one-hot encode; categories not seen during fit are ignored
# at transform time instead of raising.
categorical_features = [
    'country',
    'cat_full',
    'month',
    'name_non_chars',
    'name_has_symbol',
    'name_upper',
]
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its transformer; any column not listed above is
# passed through unchanged.
preprocessing = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
def predict_plot(X, y, classifier, classifier_name):
    """Plot the ROC curve of a fitted binary classifier evaluated on ``(X, y)``.

    The curve is drawn on the current matplotlib figure together with the
    diagonal of a random predictor; the legend shows the classifier name and
    its ROC AUC.

    Args:
        X: Feature matrix accepted by ``classifier``.
        y: True binary labels for ``X``.
        classifier: Fitted estimator exposing ``predict_proba``,
            ``decision_function`` or, as a last resort, ``predict``.
        classifier_name: Label used for the curve in the legend.
    """
    # A ROC curve needs a continuous score per sample so that every decision
    # threshold can be swept. Hard 0/1 predictions collapse the curve to a
    # single operating point and understate the AUC.
    if hasattr(classifier, "predict_proba"):
        # Second column: probability of the positive class (label 1).
        scores = classifier.predict_proba(X)[:, 1]
    elif hasattr(classifier, "decision_function"):
        scores = classifier.decision_function(X)
    else:
        scores = classifier.predict(X)

    # `roc_curve` and `roc_auc_score` are looked up in module scope at call time.
    fpr, tpr, _ = roc_curve(y, scores)
    roc_auc = roc_auc_score(y, scores)

    # Plot ROC curve
    plt.plot(fpr, tpr, label='%s ROC curve (area = %0.3f)' % (classifier_name, roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
# precision-recall curve and f1 for evaluation purposes
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import auc
from matplotlib import pyplot
# Logistic regression (liblinear solver) on top of the shared preprocessing.
lr = LogisticRegression(solver='liblinear')
# model = LogisticRegression(solver='lbfgs')
model = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('lr', lr),
])
model.fit(x_train, y_train)
# ROC on the training split.
predict_plot(X=x_train, y=y_train, classifier=model, classifier_name="Logistic Regression")
2021/06/17 06:46:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8b0fe6f58a264041b86e8d535d7777ac', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2021/06/17 06:46:31 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:46:31 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:46:31 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:46:37 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "C:\Users\ymaricar\anaconda3\lib\site-packages\mlflow\models\signature.py:127: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
predict_plot(x_test, y_test, model, "Logistic Regression")
from lightgbm import LGBMClassifier
# #Specifying the parameter
# params={}
# params['learning_rate']=0.03
# params['boosting_type']='gbdt' #GradientBoostingDecisionTree
# params['objective']='binary' #Binary target feature
# params['metric']='binary_logloss' #metric for binary classification
# params['max_depth']=10
# #train the model
# clf=lgb.train(params,d_train,100) #train the model on 100 epochs
# #prediction on the test set
# y_pred=clf.predict(X_test)
# LightGBM with default hyper-parameters behind the shared preprocessing.
clf = make_pipeline(preprocessing, LGBMClassifier())
clf.fit(x_train, y_train)
# ROC on the training split.
predict_plot(X=x_train, y=y_train, classifier=clf, classifier_name='GBM')
2021/06/17 06:46:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd5f5e1721fa5494980344f3f8f7ab27c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2021/06/17 06:46:42 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('columntransformer', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Ro...`
2021/06/17 06:46:42 WARNING mlflow.utils: Truncated the value of the key `columntransformer`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:46:42 WARNING mlflow.utils: Truncated the value of the key `columntransformer__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:46:49 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "C:\Users\ymaricar\anaconda3\lib\site-packages\mlflow\models\signature.py:127: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
predict_plot(x_test, y_test, clf, 'GBM')
# List every tunable parameter name of the pipeline.
# NOTE(review): `clf` is already make_pipeline(preprocessing, LGBMClassifier()),
# so this wraps `preprocessing` around it a second time (hence the duplicated
# `columntransformer__*` / `pipeline__columntransformer__*` keys in the dump).
# `clf.get_params()` is probably what was intended; confirm.
make_pipeline(preprocessing, clf).get_params()
{'memory': None,
'steps': [('columntransformer',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
['usd_goal_corrected', 'duration',
'len_name']),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['country', 'cat_full', 'month',
'name_non_chars', 'name_has_symbol',
'name_upper'])])),
('pipeline',
Pipeline(steps=[('columntransformer',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
RobustScaler())]),
['usd_goal_corrected',
'duration', 'len_name']),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['country', 'cat_full',
'month', 'name_non_chars',
'name_has_symbol',
'name_upper'])])),
('lgbmclassifier', LGBMClassifier())]))],
'verbose': False,
'columntransformer': ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
['usd_goal_corrected', 'duration',
'len_name']),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['country', 'cat_full', 'month',
'name_non_chars', 'name_has_symbol',
'name_upper'])]),
'pipeline': Pipeline(steps=[('columntransformer',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
RobustScaler())]),
['usd_goal_corrected',
'duration', 'len_name']),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['country', 'cat_full',
'month', 'name_non_chars',
'name_has_symbol',
'name_upper'])])),
('lgbmclassifier', LGBMClassifier())]),
'columntransformer__n_jobs': None,
'columntransformer__remainder': 'passthrough',
'columntransformer__sparse_threshold': 0.3,
'columntransformer__transformer_weights': None,
'columntransformer__transformers': [('num',
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]),
['usd_goal_corrected', 'duration', 'len_name']),
('cat',
Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
['country',
'cat_full',
'month',
'name_non_chars',
'name_has_symbol',
'name_upper'])],
'columntransformer__verbose': False,
'columntransformer__num': Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]),
'columntransformer__cat': Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
'columntransformer__num__memory': None,
'columntransformer__num__steps': [('imputer', SimpleImputer()),
('scaler', RobustScaler())],
'columntransformer__num__verbose': False,
'columntransformer__num__imputer': SimpleImputer(),
'columntransformer__num__scaler': RobustScaler(),
'columntransformer__num__imputer__add_indicator': False,
'columntransformer__num__imputer__copy': True,
'columntransformer__num__imputer__fill_value': None,
'columntransformer__num__imputer__missing_values': nan,
'columntransformer__num__imputer__strategy': 'mean',
'columntransformer__num__imputer__verbose': 0,
'columntransformer__num__scaler__copy': True,
'columntransformer__num__scaler__quantile_range': (25.0, 75.0),
'columntransformer__num__scaler__with_centering': True,
'columntransformer__num__scaler__with_scaling': True,
'columntransformer__cat__memory': None,
'columntransformer__cat__steps': [('onehot',
OneHotEncoder(handle_unknown='ignore'))],
'columntransformer__cat__verbose': False,
'columntransformer__cat__onehot': OneHotEncoder(handle_unknown='ignore'),
'columntransformer__cat__onehot__categories': 'auto',
'columntransformer__cat__onehot__drop': None,
'columntransformer__cat__onehot__dtype': numpy.float64,
'columntransformer__cat__onehot__handle_unknown': 'ignore',
'columntransformer__cat__onehot__sparse': True,
'pipeline__memory': None,
'pipeline__steps': [('columntransformer',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
['usd_goal_corrected', 'duration',
'len_name']),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['country', 'cat_full', 'month',
'name_non_chars', 'name_has_symbol',
'name_upper'])])),
('lgbmclassifier', LGBMClassifier())],
'pipeline__verbose': False,
'pipeline__columntransformer': ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
['usd_goal_corrected', 'duration',
'len_name']),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['country', 'cat_full', 'month',
'name_non_chars', 'name_has_symbol',
'name_upper'])]),
'pipeline__lgbmclassifier': LGBMClassifier(),
'pipeline__columntransformer__n_jobs': None,
'pipeline__columntransformer__remainder': 'passthrough',
'pipeline__columntransformer__sparse_threshold': 0.3,
'pipeline__columntransformer__transformer_weights': None,
'pipeline__columntransformer__transformers': [('num',
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]),
['usd_goal_corrected', 'duration', 'len_name']),
('cat',
Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
['country',
'cat_full',
'month',
'name_non_chars',
'name_has_symbol',
'name_upper'])],
'pipeline__columntransformer__verbose': False,
'pipeline__columntransformer__num': Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]),
'pipeline__columntransformer__cat': Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
'pipeline__columntransformer__num__memory': None,
'pipeline__columntransformer__num__steps': [('imputer', SimpleImputer()),
('scaler', RobustScaler())],
'pipeline__columntransformer__num__verbose': False,
'pipeline__columntransformer__num__imputer': SimpleImputer(),
'pipeline__columntransformer__num__scaler': RobustScaler(),
'pipeline__columntransformer__num__imputer__add_indicator': False,
'pipeline__columntransformer__num__imputer__copy': True,
'pipeline__columntransformer__num__imputer__fill_value': None,
'pipeline__columntransformer__num__imputer__missing_values': nan,
'pipeline__columntransformer__num__imputer__strategy': 'mean',
'pipeline__columntransformer__num__imputer__verbose': 0,
'pipeline__columntransformer__num__scaler__copy': True,
'pipeline__columntransformer__num__scaler__quantile_range': (25.0, 75.0),
'pipeline__columntransformer__num__scaler__with_centering': True,
'pipeline__columntransformer__num__scaler__with_scaling': True,
'pipeline__columntransformer__cat__memory': None,
'pipeline__columntransformer__cat__steps': [('onehot',
OneHotEncoder(handle_unknown='ignore'))],
'pipeline__columntransformer__cat__verbose': False,
'pipeline__columntransformer__cat__onehot': OneHotEncoder(handle_unknown='ignore'),
'pipeline__columntransformer__cat__onehot__categories': 'auto',
'pipeline__columntransformer__cat__onehot__drop': None,
'pipeline__columntransformer__cat__onehot__dtype': numpy.float64,
'pipeline__columntransformer__cat__onehot__handle_unknown': 'ignore',
'pipeline__columntransformer__cat__onehot__sparse': True,
'pipeline__lgbmclassifier__boosting_type': 'gbdt',
'pipeline__lgbmclassifier__class_weight': None,
'pipeline__lgbmclassifier__colsample_bytree': 1.0,
'pipeline__lgbmclassifier__importance_type': 'split',
'pipeline__lgbmclassifier__learning_rate': 0.1,
'pipeline__lgbmclassifier__max_depth': -1,
'pipeline__lgbmclassifier__min_child_samples': 20,
'pipeline__lgbmclassifier__min_child_weight': 0.001,
'pipeline__lgbmclassifier__min_split_gain': 0.0,
'pipeline__lgbmclassifier__n_estimators': 100,
'pipeline__lgbmclassifier__n_jobs': -1,
'pipeline__lgbmclassifier__num_leaves': 31,
'pipeline__lgbmclassifier__objective': None,
'pipeline__lgbmclassifier__random_state': None,
'pipeline__lgbmclassifier__reg_alpha': 0.0,
'pipeline__lgbmclassifier__reg_lambda': 0.0,
'pipeline__lgbmclassifier__silent': True,
'pipeline__lgbmclassifier__subsample': 1.0,
'pipeline__lgbmclassifier__subsample_for_bin': 200000,
'pipeline__lgbmclassifier__subsample_freq': 0}
# Gradient-boosting baseline: the shared `preprocessing` transformer followed
# by a LightGBM binary classifier (make_pipeline names the steps automatically).
clf = make_pipeline(
    preprocessing,
    LGBMClassifier(
        learning_rate=0.7,
        boosting_type="gbdt",
        objective='binary',
        # 'accuracy' is not one of LightGBM's documented metric names; the
        # built-in classification-error metric is 'binary_error'
        # (misclassification rate, i.e. 1 - accuracy).
        metric='binary_error',
        max_depth=-1,  # -1 = no depth limit (LightGBM default)
    ),
)
clf.fit(x_train, y_train)
# Report on the training split itself; `predict_plot` is defined elsewhere in
# the notebook -- presumably it predicts with `clf` and plots/prints metrics
# under the given label (verify against its definition).
predict_plot(x_train, y_train, clf, 'GBM')
2021/06/17 06:47:11 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c45ead5f7eee4144b07aa0af4064a539', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2021/06/17 06:47:11 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('columntransformer', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Ro...`
2021/06/17 06:47:11 WARNING mlflow.utils: Truncated the value of the key `columntransformer`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:47:11 WARNING mlflow.utils: Truncated the value of the key `columntransformer__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:47:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "C:\Users\ymaricar\anaconda3\lib\site-packages\mlflow\models\signature.py:127: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
# Evaluate the fitted GBM pipeline on the x_test / y_test split.
# `predict_plot` is defined elsewhere in the notebook; its exact output
# (plots vs. printed metrics) is not visible here -- check its definition.
predict_plot(x_test, y_test, clf, 'GBM')
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):
    """Pipeline step that delegates every call to a swappable classifier.

    The wrapped object is exposed as the ``estimator`` constructor parameter,
    so a search grid can replace the whole model (``<step>__estimator``) and
    tune its hyper-parameters (``<step>__estimator__<name>``).

    NOTE(review): this class does not inherit ``ClassifierMixin``. scikit-learn
    only chooses ``StratifiedKFold`` for an integer ``cv`` when the estimator
    is recognised as a classifier, so searches over this wrapper presumably
    fall back to plain ``KFold`` -- confirm whether that is intended.
    """

    def __init__(
        self,
        estimator=LogisticRegression(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - The classifier
        """
        # NOTE: the default instance is created once, when the class is
        # defined, and is therefore shared by every ClfSwitcher() built
        # without an explicit estimator. The default is kept as-is so that
        # get_params() and nested parameter names remain unchanged.
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator and return ``self``.

        :param X: training features, passed through unchanged
        :param y: training targets, passed through unchanged
        :param kwargs: extra fit parameters (e.g. ``sample_weight``); these
            are forwarded to the wrapped estimator instead of being dropped
        :return: this ClfSwitcher instance
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X``.

        ``y`` is accepted for call compatibility and ignored.
        """
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score(X, y)``."""
        return self.estimator.score(X, y)
def _logreg_grid(solvers, penalty):
    """Build one LogisticRegression sub-grid for the 'clf' pipeline step.

    Only the solver list and the penalty differ between sub-grids; the C and
    max_iter candidates are the same for all of them.
    """
    return {
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__solver': solvers,
        "clf__estimator__penalty": [penalty],
        "clf__estimator__C": [0.1, 0.2, 0.3, 0.5, 1.0],
        "clf__estimator__max_iter": [100, 1000, 2000],
    }


# Preprocessing followed by the switchable classifier wrapper.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('clf', ClfSwitcher()),
    ]
)

# Two sub-grids: l2 with both solvers (2 * 5 * 3 = 30 candidates) and
# l1 with liblinear only (1 * 5 * 3 = 15 candidates) -- 45 in total.
parameters = [
    _logreg_grid(["lbfgs", "liblinear"], "l2"),
    _logreg_grid(["liblinear"], "l1"),
]

# Exhaustive search with 2-fold CV, using all available cores.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=3,
)
# fit() returns the search object itself, so gs_model is the fitted gscv.
gs_model = gscv.fit(x_train, y_train)
print(gs_model.best_params_, gs_model.best_score_)
2021/06/17 06:47:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6faf78cd177345cfb00eb2961575bb43', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2021/06/17 06:47:26 WARNING mlflow.utils: Truncated the value of the key `estimator`. Truncated value: `Pipeline(steps=[('preprocessing',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
...`
2021/06/17 06:47:26 WARNING mlflow.utils: Truncated the value of the key `param_grid`. Truncated value: `[{'clf__estimator': [LogisticRegression()], 'clf__estimator__solver': ['lbfgs', 'liblinear'], 'clf__estimator__penalty': ['l2'], 'clf__estimator__C': [0.1, 0.2, 0.3, 0.5, 1.0], 'clf__estimator__max_iter': [100, 1000, 2000]}, {'clf__estimator': [Lo...`
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
Fitting 2 folds for each of 45 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Done 8 tasks | elapsed: 7.4s
[Parallel(n_jobs=-1)]: Done 90 out of 90 | elapsed: 2.8min finished
C:\Users\ymaricar\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
2021/06/17 06:50:17 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "C:\Users\ymaricar\anaconda3\lib\site-packages\mlflow\models\signature.py:127: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:18 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:19 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:20 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:21 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:22 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:23 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:24 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('preprocessing', ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', Robust...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `preprocessing`. Truncated value: `ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', RobustScaler())]),
...`
2021/06/17 06:50:25 WARNING mlflow.utils: Truncated the value of the key `preprocessing__transformers`. Truncated value: `[('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', RobustScaler())]), ['usd_goal_corrected', 'duration', 'len_name']), ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['country', 'cat_full', 'month', 'n...`
{'clf__estimator': LogisticRegression(C=0.3), 'clf__estimator__C': 0.3, 'clf__estimator__max_iter': 100, 'clf__estimator__penalty': 'l2', 'clf__estimator__solver': 'lbfgs'} 0.6767994140970223
# One-hot encode the categorical feature columns. An earlier variant of this
# cell encoded 'category' and 'main_category' instead of 'cat_full'.
x = pd.get_dummies(
    x,
    columns=['month', 'cat_full', 'country'],
)

from sklearn.preprocessing import RobustScaler

# Continuous features that get rescaled in place with a RobustScaler.
# NOTE(review): the scaler is fitted on the full feature matrix, i.e. before
# the train/test split that follows later in the notebook, so test rows
# influence the scaling statistics - confirm whether that is acceptable.
num_cols = ['usd_goal_corrected', 'duration', 'len_name']
transformer = RobustScaler()
transformer.fit(x[num_cols])
x[num_cols] = transformer.transform(x[num_cols])

# Summary statistics of the encoded and scaled feature matrix.
x.describe()
| duration | len_name | name_non_chars | name_has_symbol | name_upper | usd_goal_corrected | month_1 | month_2 | month_3 | month_4 | ... | country_IE | country_IT | country_LU | country_MX | country_NL | country_NO | country_NZ | country_SE | country_SG | country_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | ... | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 | 281637.000000 |
| mean | 0.662709 | 0.036555 | 0.963318 | 0.105174 | 0.207533 | 0.034937 | 0.072846 | 0.080618 | 0.090038 | 0.087354 | ... | 0.001679 | 0.004950 | 0.000117 | 0.000014 | 0.006650 | 0.001527 | 0.003526 | 0.003785 | 0.000249 | 0.816008 |
| std | 2.148998 | 0.568473 | 0.187980 | 0.306779 | 0.405541 | 0.823673 | 0.259883 | 0.272248 | 0.286237 | 0.282353 | ... | 0.040947 | 0.070179 | 0.010824 | 0.003769 | 0.081279 | 0.039044 | 0.059274 | 0.061406 | 0.015763 | 0.387478 |
| min | -4.833333 | -1.142857 | 0.000000 | 0.000000 | 0.000000 | -4.223167 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | -0.464286 | 1.000000 | 0.000000 | 0.000000 | -0.454706 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 50% | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 75% | 1.000000 | 0.535714 | 1.000000 | 0.000000 | 0.000000 | 0.545294 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| max | 10.333333 | 1.857143 | 1.000000 | 1.000000 | 1.000000 | 5.168749 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 204 columns
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; SEED fixes the shuffling.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SEED
)

# Report the shape of every split.
for _label, _part in (
    ('x_train.shape:', x_train),
    ('y_train.shape:', y_train),
    ('x_test.shape :', x_test),
    ('y_test.shape :', y_test),
):
    print(_label, _part.shape)
x_train.shape: (225309, 204) y_train.shape: (225309,) x_test.shape : (56328, 204) y_test.shape : (56328,)
# Baseline logistic regression (liblinear solver, otherwise default
# hyper-parameters), built and fitted on the training split in one step.
lr = LogisticRegression(solver='liblinear').fit(x_train, y_train)
# Echo the fitted estimator as the cell result.
lr
2021/06/17 06:50:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9ea0091684414f31985e5d317df363b0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow 2021/06/17 06:50:33 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "C:\Users\ymaricar\anaconda3\lib\site-packages\mlflow\models\signature.py:127: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
LogisticRegression(solver='liblinear')
# Hard class predictions for the test split; these feed the confusion matrix
# and the classification report below.
y_pred_lr = lr.predict(x_test)

# ROC/AUC measures how well the model ranks samples, so it has to be computed
# from continuous scores (probability of the positive class), not from the
# thresholded 0/1 labels returned by predict(): with hard labels the curve
# collapses to a single operating point and the AUC is understated.
print('Train data ROC/AUC :', roc_auc_score(y_true=y_train, y_score=lr.predict_proba(x_train)[:, 1]))
print('Test data ROC/AUC :', roc_auc_score(y_true=y_test, y_score=lr.predict_proba(x_test)[:, 1]))

# Confusion matrix on the test split
print('\nConfusion matrix')
print(confusion_matrix(y_true=y_test, y_pred=y_pred_lr))

# Per-class precision / recall / f1 on the test split
print('\nClassification matrix')
print(classification_report(y_true=y_test, y_pred=y_pred_lr))
Train data ROC/AUC : 0.6468634521568741
Test data ROC/AUC : 0.6458125755654817
Confusion matrix
[[27079 6649]
[11554 11046]]
Classification matrix
precision recall f1-score support
0 0.70 0.80 0.75 33728
1 0.62 0.49 0.55 22600
accuracy 0.68 56328
macro avg 0.66 0.65 0.65 56328
weighted avg 0.67 0.68 0.67 56328
%%time
from sklearn.model_selection import GridSearchCV

# Search space: seven values of C, log-spaced from 1e-3 to 1e3, combined with
# both the L1 and the L2 penalty.
grid = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
}

# Base estimator handed to the grid search.
lr = LogisticRegression(solver='liblinear')

# Exhaustive grid search with 10-fold cross validation; every candidate is
# scored by ROC AUC.
lr_cv = GridSearchCV(estimator=lr, param_grid=grid, cv=10, scoring='roc_auc')

# Run the search on the training split.
lr_cv.fit(x_train, y_train)

print('best paremeters for logistic regression with liblinear: ', lr_cv.best_params_)
print('best score for logistic regression after grid search cv:', lr_cv.best_score_)
best paremeters for logistic regression with liblinear: {'C': 1.0, 'penalty': 'l2'}
best score for logistic regression after grid search cv: 0.7278458440411556
Wall time: 15min 24s
# Refit a logistic regression with the hyper-parameters reported by the grid
# search (C=1.0, L2 penalty) on the whole training split.
lr_tuned = LogisticRegression(solver='liblinear', C=1.0, penalty='l2')
lr_tuned.fit(x_train, y_train)

# Hard class predictions for the test split; these feed the confusion matrix
# and the classification report below.
y_pred_lr = lr_tuned.predict(x_test)

# ROC/AUC must be computed from continuous scores (probability of the positive
# class). Feeding the thresholded 0/1 labels from predict() reduces the curve
# to a single operating point and is not comparable with the 'roc_auc' score
# reported by the grid search.
print('Train data ROC/AUC :', roc_auc_score(y_true=y_train, y_score=lr_tuned.predict_proba(x_train)[:, 1]))
print('Test data ROC/AUC :', roc_auc_score(y_true=y_test, y_score=lr_tuned.predict_proba(x_test)[:, 1]))

# Confusion matrix on the test split
print('\nConfusion matrix')
print(confusion_matrix(y_true=y_test, y_pred=y_pred_lr))

# Per-class precision / recall / f1 on the test split
print('\nClassification matrix')
print(classification_report(y_true=y_test, y_pred=y_pred_lr))
Train data ROC/AUC : 0.6468634521568741
Test data ROC/AUC : 0.6458125755654817
Confusion matrix
[[27079 6649]
[11554 11046]]
Classification matrix
precision recall f1-score support
0 0.70 0.80 0.75 33728
1 0.62 0.49 0.55 22600
accuracy 0.68 56328
macro avg 0.66 0.65 0.65 56328
weighted avg 0.67 0.68 0.67 56328
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Score every test sample with the predicted probability of the positive
# class. A ROC curve sweeps the decision threshold over these scores; hard 0/1
# predictions would yield only one non-trivial point on the curve.
y_score_lr = lr_tuned.predict_proba(x_test)[:, 1]

# Compute fpr, tpr, thresholds and roc auc
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr)
roc_auc = roc_auc_score(y_test, y_score_lr)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = random predictions
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
<matplotlib.legend.Legend at 0x2a6c754cf70>
We can then use model.predict_proba(x_test)[:, 1] to get, for each test sample, the predicted probability that it belongs to the positive class of the target.
# Coefficients of the fitted logistic regression, used here as feature importances.
importance = lr_tuned.coef_
# Pair each feature name with its coefficient and order the pairs from the
# most negative coefficient to the most positive one.
sort_list = sorted(
    ((x.columns[idx], coef) for idx, coef in enumerate(importance[0])),
    key=lambda pair: pair[1],
)
# Bar chart of the sorted coefficients (one bar per feature, in sorted order).
bar_positions = list(range(len(sort_list)))
bar_heights = [coef for _, coef in sort_list]
plt.bar(bar_positions, bar_heights)
plt.show()
# Text listing of every feature with its coefficient, in the same order as the chart.
for feature_name, coef in sort_list:
    print('%s: %.5f' % (feature_name, coef))
cat_full_Technology-Web: -1.71453 cat_full_Technology-Apps: -1.66991 cat_full_Games-Mobile Games: -1.46526 cat_full_Journalism-Video: -1.39119 cat_full_Crafts-Candles: -1.33941 cat_full_Food-Bacon: -1.33810 cat_full_Crafts-DIY: -1.22958 cat_full_Crafts-Embroidery: -1.17718 cat_full_Crafts-Crochet: -1.17300 cat_full_Music-Hip-Hop: -1.16235 cat_full_Photography-Nature: -1.10124 cat_full_Journalism-Photo: -1.05694 cat_full_Photography-Places: -1.05424 cat_full_Food-Farmer's Markets: -1.05318 cat_full_Food-Events: -1.02211 cat_full_Technology-Software: -0.93443 cat_full_Journalism-Web: -0.93288 cat_full_Food-Food Trucks: -0.92030 cat_full_Film & Video-Action: -0.88058 cat_full_Fashion-Jewelry: -0.83463 cat_full_Fashion-Childrenswear: -0.78627 cat_full_Fashion-Couture: -0.77840 cat_full_Publishing-Young Adult: -0.77808 cat_full_Fashion-Ready-to-wear: -0.76492 cat_full_Photography-People: -0.74823 cat_full_Music-R&B: -0.74178 cat_full_Fashion-Apparel: -0.73162 cat_full_Crafts-Woodworking: -0.71396 cat_full_Crafts-Crafts: -0.70795 cat_full_Film & Video-Television: -0.69989 cat_full_Games-Live Games: -0.62705 usd_goal_corrected: -0.60201 country_IT: -0.59630 cat_full_Crafts-Quilts: -0.59579 cat_full_Publishing-Academic: -0.59180 cat_full_Publishing-Fiction: -0.57108 cat_full_Design-Interactive Design: -0.56981 cat_full_Photography-Animals: -0.55697 cat_full_Crafts-Weaving: -0.54599 cat_full_Crafts-Glass: -0.54051 cat_full_Art-Digital Art: -0.53963 cat_full_Crafts-Printing: -0.53699 cat_full_Food-Restaurants: -0.53504 cat_full_Journalism-Audio: -0.51288 cat_full_Food-Community Gardens: -0.50704 cat_full_Art-Video Art: -0.46109 cat_full_Film & Video-Family: -0.45549 cat_full_Journalism-Print: -0.44977 cat_full_Technology-Flight: -0.42039 cat_full_Fashion-Fashion: -0.40441 cat_full_Food-Farms: -0.40236 cat_full_Food-Cookbooks: -0.39804 cat_full_Film & Video-Experimental: -0.39282 cat_full_Publishing-Nonfiction: -0.37255 country_AT: -0.36524 cat_full_Food-Small Batch: -0.36433 
cat_full_Technology-Technology: -0.35116 cat_full_Art-Textiles: -0.33499 country_NL: -0.30660 cat_full_Journalism-Journalism: -0.30573 country_NO: -0.30400 country_ES: -0.29748 cat_full_Crafts-Stationery: -0.27852 cat_full_Games-Games: -0.26604 month_7: -0.26586 cat_full_Games-Video Games: -0.26111 cat_full_Publishing-Translations: -0.25421 cat_full_Food-Drinks: -0.25133 cat_full_Publishing-Poetry: -0.23221 cat_full_Art-Painting: -0.22980 cat_full_Publishing-Publishing: -0.22140 cat_full_Fashion-Pet Fashion: -0.20958 cat_full_Comics-Events: -0.20605 cat_full_Music-Electronic Music: -0.19065 cat_full_Dance-Workshops: -0.18918 cat_full_Design-Architecture: -0.17246 month_8: -0.16879 country_DE: -0.16389 country_BE: -0.16039 cat_full_Games-Gaming Hardware: -0.15757 cat_full_Photography-Photography: -0.15558 cat_full_Art-Mixed Media: -0.15548 cat_full_Food-Food: -0.15423 month_1: -0.14945 cat_full_Design-Graphic Design: -0.14624 month_12: -0.13319 name_non_chars: -0.12255 cat_full_Art-Conceptual Art: -0.10551 country_HK: -0.10249 duration: -0.09232 cat_full_Film & Video-Webseries: -0.08261 country_IE: -0.07953 country_AU: -0.07619 country_CH: -0.07051 cat_full_Publishing-Children's Books: -0.06712 country_MX: -0.05275 cat_full_Fashion-Accessories: -0.04693 cat_full_Photography-Fine Art: -0.04689 cat_full_Film & Video-Music Videos: -0.04669 month_6: -0.04390 cat_full_Film & Video-Thrillers: -0.03300 cat_full_Film & Video-Animation: -0.03201 month_9: -0.01836 cat_full_Crafts-Taxidermy: -0.01616 month_11: -0.00674 month_5: -0.00063 country_CA: 0.00061 month_10: 0.00802 month_2: 0.00972 cat_full_Technology-Fabrication Tools: 0.00979 country_SE: 0.01120 month_4: 0.02061 month_3: 0.02700 cat_full_Film & Video-Horror: 0.04823 cat_full_Film & Video-Film & Video: 0.07188 country_NZ: 0.07627 cat_full_Film & Video-Movie Theaters: 0.08053 name_has_symbol: 0.08074 cat_full_Music-Metal: 0.08364 cat_full_Music-Latin: 0.09738 cat_full_Design-Design: 0.12884 cat_full_Film & 
Video-Romance: 0.13410 cat_full_Fashion-Footwear: 0.13517 cat_full_Food-Vegan: 0.14005 cat_full_Art-Art: 0.14914 cat_full_Food-Spaces: 0.15762 cat_full_Dance-Spaces: 0.16240 cat_full_Film & Video-Fantasy: 0.16297 country_DK: 0.16648 cat_full_Art-Sculpture: 0.20047 name_upper: 0.20463 country_GB: 0.20984 cat_full_Publishing-Calendars: 0.21214 cat_full_Music-Punk: 0.21696 cat_full_Games-Puzzles: 0.22328 cat_full_Art-Performance Art: 0.23715 cat_full_Technology-Gadgets: 0.24128 cat_full_Publishing-Radio & Podcasts: 0.24825 cat_full_Art-Ceramics: 0.25001 cat_full_Film & Video-Comedy: 0.26558 country_FR: 0.28360 cat_full_Technology-Space Exploration: 0.28785 cat_full_Technology-Makerspaces: 0.30825 cat_full_Art-Illustration: 0.30903 cat_full_Publishing-Zines: 0.31109 cat_full_Photography-Photobooks: 0.33442 country_SG: 0.34003 cat_full_Design-Product Design: 0.35438 cat_full_Music-Faith: 0.35736 cat_full_Publishing-Periodicals: 0.37631 country_US: 0.37812 country_LU: 0.38768 len_name: 0.39323 cat_full_Music-Blues: 0.39781 cat_full_Publishing-Literary Journals: 0.41546 cat_full_Crafts-Knitting: 0.41906 cat_full_Technology-Hardware: 0.41955 cat_full_Film & Video-Documentary: 0.42267 cat_full_Film & Video-Drama: 0.43496 cat_full_Games-Playing Cards: 0.43620 cat_full_Design-Civic Design: 0.43744 cat_full_Music-World Music: 0.44756 cat_full_Music-Pop: 0.48144 cat_full_Technology-3D Printing: 0.50419 cat_full_Film & Video-Festivals: 0.50880 cat_full_Technology-Robots: 0.51376 cat_full_Music-Music: 0.51571 cat_full_Technology-Wearables: 0.53117 cat_full_Film & Video-Narrative Film: 0.54874 cat_full_Technology-DIY Electronics: 0.55686 cat_full_Film & Video-Science Fiction: 0.58022 cat_full_Music-Rock: 0.58286 cat_full_Comics-Comics: 0.63514 cat_full_Music-Kids: 0.66428 cat_full_Art-Public Art: 0.66524 cat_full_Art-Installations: 0.66662 cat_full_Publishing-Art Books: 0.68980 cat_full_Technology-Sound: 0.73211 cat_full_Crafts-Pottery: 0.76961 cat_full_Theater-Musical: 0.79173 
cat_full_Theater-Experimental: 0.84105 cat_full_Film & Video-Shorts: 0.86846 cat_full_Comics-Graphic Novels: 0.87065 cat_full_Publishing-Anthologies: 0.87343 cat_full_Theater-Immersive: 0.88126 cat_full_Theater-Plays: 0.91361 cat_full_Comics-Webcomics: 0.94256 cat_full_Comics-Comic Books: 0.94655 cat_full_Theater-Spaces: 0.96437 cat_full_Theater-Festivals: 0.99423 cat_full_Technology-Camera Equipment: 1.00353 cat_full_Dance-Performances: 1.00502 cat_full_Music-Jazz: 1.03557 cat_full_Theater-Theater: 1.04867 cat_full_Crafts-Letterpress: 1.08059 cat_full_Music-Country & Folk: 1.09114 cat_full_Music-Indie Rock: 1.09831 cat_full_Music-Classical Music: 1.13767 cat_full_Music-Chiptune: 1.21048 cat_full_Dance-Dance: 1.21385 cat_full_Games-Tabletop Games: 1.22086 cat_full_Design-Typography: 1.23384 cat_full_Dance-Residencies: 1.36250 cat_full_Comics-Anthologies: 1.83013
3) From what we have observed through EDA (I did not keep all of my code for that part here), it mostly seems better to start a project in:
# Join the keys of more_success_than_failed into one comma-separated string and print it.
promising_categories = ", ".join(more_success_than_failed.keys())
print("The most promising categories to start a kickstarter in are:", promising_categories)
The most promising categories to start a kickstarter in are: Music-Music, Film & Video-Shorts, Games-Tabletop Games, Music-Rock, Theater-Theater, Comics-Comics, Music-Indie Rock, Music-Pop, Music-Country & Folk, Art-Public Art, Art-Illustration, Publishing-Art Books, Music-Classical Music, Comics-Comic Books, Dance-Dance, Music-Jazz, Comics-Graphic Novels, Theater-Plays, Theater-Musical, Dance-Performances, Technology-DIY Electronics, Comics-Webcomics, Art-Installations, Theater-Festivals, Technology-Camera Equipment, Theater-Experimental, Music-Punk, Theater-Immersive, Film & Video-Festivals, Publishing-Literary Journals, Publishing-Anthologies, Theater-Spaces, Crafts-Knitting, Comics-Anthologies, Crafts-Pottery, Design-Typography, Dance-Residencies, Crafts-Letterpress, Music-Chiptune
Furthermore, it seems that projects with a duration shorter than one month have a better chance of success.
I think our study is incomplete because we do not study the interactions between creators and potential backers around a project: the comments and the number of shares across the web are what make a Kickstarter project with a reasonably high funding goal succeed, by targeting the right people and generating contributions within the allotted timeline. We can also see that, among the most successful categories, the mean usd_goal differs between failed and successful projects: failed projects tend to set higher goals. Thus, keeping the goal similar to that of previously successful projects in the same domain gives a project a better chance of succeeding.
The factors behind a project's success go far beyond what this dataset contains, as the real issue seems to be how people find these projects; Kickstarter is above all the hosting platform that receives the funds. Still, it is interesting to see that we were able to extract some useful insights and end up with a final model that has around 68% accuracy.